# Load TPMs from supplementary files
url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE137001&format=file&file=GSE137001%5FTPM%5Freprogramming%5Fintermediates%2Exlsx"
filename <- "GSE137001_TPM_reprogramming_intermediates.xlsx"
if (!file.exists(filename)) {
  download.file(url, filename)
}
supp_tpms <- read_excel(filename)
# Get run_id, GSM code, sample_name conversion table
run_meta <- read_tsv("run_metadata.txt", show_col_types = FALSE)

# Generate metadata from column names of supplied TPMs
metadata <- tibble(sample_name = colnames(supp_tpms[0, 3:41])) %>%
  mutate(state = case_when(
    startsWith(sample_name, "d") ~ gsub("(d\\d+)_.+", "\\1", sample_name),
    grepl("MEF", sample_name) ~ "MEF",
    grepl("iPSC", sample_name) ~ "iPSC",
    grepl("ESC", sample_name) ~ "ESC"
  )) %>%
  mutate(factors = case_when(
    grepl("OSKM", sample_name) ~ "OSKM",
    grepl("SKM", sample_name) ~ "SKM",
    TRUE ~ "NA"
  )) %>%
  mutate(adhesion = case_when(
    grepl("Epcam", sample_name) ~ "Epcam",
    grepl("GFP", sample_name) ~ "GFP",
    TRUE ~ "NA"
  )) %>%
    mutate(rep = case_when(
    grepl("_R1", sample_name) ~ "1",
    grepl("_R2", sample_name) ~ "2"
  )) %>%
  mutate_if(is.character, as.factor) %>%
  mutate(state = ordered(state,
    levels = c("MEF", "d2", "d4", "d6", "d9", "d12", "iPSC", "ESC")
  ))

# Join inferred metadata with run metadata
metadata <- metadata %>%
  left_join(run_meta, by = "sample_name")

PCA Using TPMs From Supplementary Files

# Log transform data for PCA
tpms_mat <- supp_tpms %>% select(3:41) %>% as.matrix() %>% t()
trans_mat <- log2(1 + (tpms_mat))

pca_res <- stats::prcomp(trans_mat) 
var_explained <- pca_res$sdev^2 / sum(pca_res$sdev^2)

pca_tbl <- pca_res$x %>%
  as.data.frame() %>%
  rownames_to_column("sample_name") %>%
  left_join(metadata, by = c("sample_name"))

pca_tbl %>%
  ggplot(aes(
    x = PC1, y = PC2, color = state, shape = factors
  )) + 
  geom_point(size = 3) +
  geom_text(aes(label = state), hjust = -0.2, vjust = -0.2) +
  xlab(paste("PC1: ", round(var_explained[1] * 100, digits = 2), "%", sep = "")) +
  ylab(paste("PC2: ", round(var_explained[2] * 100, digits = 2), "%", sep = ""))

Get tximport Object From Reprocessed SRAs

tx2sym <- AnnotationDbi::select(
  EnsDb.Mmusculus.v79,
  keys=keys(EnsDb.Mmusculus.v79),
  columns = c("TXNAME", "SYMBOL")
)

run_ids <- list.dirs(
  "reprocess/quants",
  recursive = FALSE,
  full.names = FALSE
)

# Keep only run_ids that exist in supplementary data
run_ids <- run_ids[run_ids %in% metadata$run_id]

paths <- file.path("reprocess/quants", run_ids, "quant.sf")
names(paths) <- run_ids

txi <- tximport(paths,
  type = "salmon",
  tx2gene = tx2sym %>% select(TXNAME, GENEID),
  ignoreTxVersion = TRUE
)
reading in files with read_tsv
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
transcripts missing from tx2gene: 15338
summarizing abundance
summarizing counts
summarizing length
dds <- DESeqDataSetFromTximport(
  txi,
  metadata %>% arrange(run_id) %>% column_to_rownames("run_id"),
  ~factors
)
using counts and average transcript lengths from tximport
vsd <- vst(dds, blind = TRUE)
using 'avgTxLength' from assays(dds), correcting for library size

PCA Using Counts From Reprocessed SRAs

# Log transform data for PCA
trans_mat <- assay(vsd) %>% as.matrix() %>% t()

pca_res <- stats::prcomp(trans_mat) 
var_explained <- pca_res$sdev^2 / sum(pca_res$sdev^2)

pca_tbl <- pca_res$x %>%
  as.data.frame() %>%
  rownames_to_column("run_id") %>%
  left_join(metadata, by = c("run_id"))

pca_tbl %>%
  ggplot(aes(
    x = PC1, y = PC2, color = state, shape = factors
  )) + 
  geom_point(size = 3) +
  geom_text(aes(label = state), hjust = -0.2, vjust = -0.2) +
  xlab(paste("PC1: ", round(var_explained[1] * 100, digits = 2), "%", sep = "")) +
  ylab(paste("PC2: ", round(var_explained[2] * 100, digits = 2), "%", sep = ""))

PC1: 39.02%
PC2: 21.48%
PC3: 12.26%

plotly::plot_ly(pca_tbl,
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~pca_tbl$state,
  symbol = ~pca_tbl$factors,
  marker = list(size = 10)
) %>%
  plotly::add_markers()
LS0tCnRpdGxlOiAiUENBIENvbXBhcmlzb24gKEdTRTEzNzAwMSkiCmF1dGhvcjogIkphbWVzIERhbyIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQogICAgY29kZV9mb2xkaW5nOiBoaWRlCiAgICB0aGVtZTogY2VydWxlYW4KLS0tCgpgYGB7ciBpbmNsdWRlPUZBTFNFfQpsaWJyYXJ5KEVuc0RiLk1tdXNjdWx1cy52NzkpCmxpYnJhcnkoREVTZXEyKQpsaWJyYXJ5KHR4aW1wb3J0KQpsaWJyYXJ5KHJlYWR4bCkKbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKCmBgYHtyfQojIExvYWQgVFBNcyBmcm9tIHN1cHBsZW1lbnRhcnkgZmlsZXMKdXJsIDwtICJodHRwczovL3d3dy5uY2JpLm5sbS5uaWguZ292L2dlby9kb3dubG9hZC8/YWNjPUdTRTEzNzAwMSZmb3JtYXQ9ZmlsZSZmaWxlPUdTRTEzNzAwMSU1RlRQTSU1RnJlcHJvZ3JhbW1pbmclNUZpbnRlcm1lZGlhdGVzJTJFeGxzeCIKZmlsZW5hbWUgPC0gIkdTRTEzNzAwMV9UUE1fcmVwcm9ncmFtbWluZ19pbnRlcm1lZGlhdGVzLnhsc3giCmlmICghZmlsZS5leGlzdHMoZmlsZW5hbWUpKSB7CiAgZG93bmxvYWQuZmlsZSh1cmwsIGZpbGVuYW1lKQp9CnN1cHBfdHBtcyA8LSByZWFkX2V4Y2VsKGZpbGVuYW1lKQpgYGAKCmBgYHtyfQojIEdldCBydW5faWQsIEdTTSBjb2RlLCBzYW1wbGVfbmFtZSBjb252ZXJzaW9uIHRhYmxlCnJ1bl9tZXRhIDwtIHJlYWRfdHN2KCJydW5fbWV0YWRhdGEudHh0Iiwgc2hvd19jb2xfdHlwZXMgPSBGQUxTRSkKCiMgR2VuZXJhdGUgbWV0YWRhdGEgZnJvbSBjb2x1bW4gbmFtZXMgb2Ygc3VwcGxpZWQgVFBNcwptZXRhZGF0YSA8LSB0aWJibGUoc2FtcGxlX25hbWUgPSBjb2xuYW1lcyhzdXBwX3RwbXNbMCwgMzo0MV0pKSAlPiUKICBtdXRhdGUoc3RhdGUgPSBjYXNlX3doZW4oCiAgICBzdGFydHNXaXRoKHNhbXBsZV9uYW1lLCAiZCIpIH4gZ3N1YigiKGRcXGQrKV8uKyIsICJcXDEiLCBzYW1wbGVfbmFtZSksCiAgICBncmVwbCgiTUVGIiwgc2FtcGxlX25hbWUpIH4gIk1FRiIsCiAgICBncmVwbCgiaVBTQyIsIHNhbXBsZV9uYW1lKSB+ICJpUFNDIiwKICAgIGdyZXBsKCJFU0MiLCBzYW1wbGVfbmFtZSkgfiAiRVNDIgogICkpICU+JQogIG11dGF0ZShmYWN0b3JzID0gY2FzZV93aGVuKAogICAgZ3JlcGwoIk9TS00iLCBzYW1wbGVfbmFtZSkgfiAiT1NLTSIsCiAgICBncmVwbCgiU0tNIiwgc2FtcGxlX25hbWUpIH4gIlNLTSIsCiAgICBUUlVFIH4gIk5BIgogICkpICU+JQogIG11dGF0ZShhZGhlc2lvbiA9IGNhc2Vfd2hlbigKICAgIGdyZXBsKCJFcGNhbSIsIHNhbXBsZV9uYW1lKSB+ICJFcGNhbSIsCiAgICBncmVwbCgiR0ZQIiwgc2FtcGxlX25hbWUpIH4gIkdGUCIsCiAgICBUUlVFIH4gIk5BIgogICkpICU+JQogICAgbXV0YXRlKHJlcCA9IGNhc2Vfd2hlbigKICAgIGdyZXBsKCJfUjEiLCBzYW1wbGVfbmFtZSkgfiAiMSIsCiAgICBncmVwbCgiX1IyIiwgc2FtcGxlX25hbWUpIH4gIjIiCiAgKSkgJT4lCiAgbXV0YXRlX2lmKGlzLmNoYXJhY3RlciwgYXMuZmFjdG9yKSAlPiUKICBtdXRhdGUoc3RhdGUgPSBvcmRlcmVkKHN0YXRlLAogICAgbGV2ZWxzID0gYygiTUVGIiwgImQyIiwgImQ0IiwgImQ2IiwgImQ5IiwgImQxMiIsICJpUFNDIiwgIkVTQyIpCiAgKSkKCiMgSm9pbiBpbmZlcnJlZCBtZXRhZGF0YSB3aXRoIHJ1biBtZXRhZGF0YQptZXRhZGF0YSA8LSBtZXRhZGF0YSAlPiUKICBsZWZ0X2pvaW4ocnVuX21ldGEsIGJ5ID0gInNhbXBsZV9uYW1lIikKYGBgCgojIFBDQSBVc2luZyBUUE1zIEZyb20gU3VwcGxlbWVudGFyeSBGaWxlcwoKYGBge3J9CiMgTG9nIHRyYW5zZm9ybSBkYXRhIGZvciBQQ0EKdHBtc19tYXQgPC0gc3VwcF90cG1zICU+JSBzZWxlY3QoMzo0MSkgJT4lIGFzLm1hdHJpeCgpICU+JSB0KCkKdHJhbnNfbWF0IDwtIGxvZzIoMSArICh0cG1zX21hdCkpCgpwY2FfcmVzIDwtIHN0YXRzOjpwcmNvbXAodHJhbnNfbWF0KSAKdmFyX2V4cGxhaW5lZCA8LSBwY2FfcmVzJHNkZXZeMiAvIHN1bShwY2FfcmVzJHNkZXZeMikKCnBjYV90YmwgPC0gcGNhX3JlcyR4ICU+JQogIGFzLmRhdGEuZnJhbWUoKSAlPiUKICByb3duYW1lc190b19jb2x1bW4oInNhbXBsZV9uYW1lIikgJT4lCiAgbGVmdF9qb2luKG1ldGFkYXRhLCBieSA9IGMoInNhbXBsZV9uYW1lIikpCgpwY2FfdGJsICU+JQogIGdncGxvdChhZXMoCiAgICB4ID0gUEMxLCB5ID0gUEMyLCBjb2xvciA9IHN0YXRlLCBzaGFwZSA9IGZhY3RvcnMKICApKSArIAogIGdlb21fcG9pbnQoc2l6ZSA9IDMpICsKICBnZW9tX3RleHQoYWVzKGxhYmVsID0gc3RhdGUpLCBoanVzdCA9IC0wLjIsIHZqdXN0ID0gLTAuMikgKwogIHhsYWIocGFzdGUoIlBDMTogIiwgcm91bmQodmFyX2V4cGxhaW5lZFsxXSAqIDEwMCwgZGlnaXRzID0gMiksICIlIiwgc2VwID0gIiIpKSArCiAgeWxhYihwYXN0ZSgiUEMyOiAiLCByb3VuZCh2YXJfZXhwbGFpbmVkWzJdICogMTAwLCBkaWdpdHMgPSAyKSwgIiUiLCBzZXAgPSAiIikpCmBgYAoKIyBHZXQgdHhpbXBvcnQgT2JqZWN0IEZyb20gUmVwcm9jZXNzZWQgU1JBcwoKYGBge3J9CnR4MnN5bSA8LSBBbm5vdGF0aW9uRGJpOjpzZWxlY3QoCiAgRW5zRGIuTW11c2N1bHVzLnY3OSwKICBrZXlzPWtleXMoRW5zRGIuTW11c2N1bHVzLnY3OSksCiAgY29sdW1ucyA9IGMoIlRYTkFNRSIsICJTWU1CT0wiKQopCgpydW5faWRzIDwtIGxpc3QuZGlycygKICAicmVwcm9jZXNzL3F1YW50cyIsCiAgcmVjdXJzaXZlID0gRkFMU0UsCiAgZnVsbC5uYW1lcyA9IEZBTFNFCikKCiMgS2VlcCBvbmx5IHJ1bl9pZHMgdGhhdCBleGlzdCBpbiBzdXBwbGVtZW50YXJ5IGRhdGEKcnVuX2lkcyA8LSBydW5faWRzW3J1bl9pZHMgJWluJSBtZXRhZGF0YSRydW5faWRdCgpwYXRocyA8LSBmaWxlLnBhdGgoInJlcHJvY2Vzcy9xdWFudHMiLCBydW5faWRzLCAicXVhbnQuc2YiKQpuYW1lcyhwYXRocykgPC0gcnVuX2lkcwoKdHhpIDwtIHR4aW1wb3J0KHBhdGhzLAogIHR5cGUgPSAic2FsbW9uIiwKICB0eDJnZW5lID0gdHgyc3ltICU+JSBzZWxlY3QoVFhOQU1FLCBHRU5FSUQpLAogIGlnbm9yZVR4VmVyc2lvbiA9IFRSVUUKKQpgYGAKCmBgYHtyfQpkZHMgPC0gREVTZXFEYXRhU2V0RnJvbVR4aW1wb3J0KAogIHR4aSwKICBtZXRhZGF0YSAlPiUgYXJyYW5nZShydW5faWQpICU+JSBjb2x1bW5fdG9fcm93bmFtZXMoInJ1bl9pZCIpLAogIH5mYWN0b3JzCikKCnZzZCA8LSB2c3QoZGRzLCBibGluZCA9IFRSVUUpCmBgYAoKIyBQQ0EgVXNpbmcgQ291bnRzIEZyb20gUmVwcm9jZXNzZWQgU1JBcwoKYGBge3J9CiMgTG9nIHRyYW5zZm9ybSBkYXRhIGZvciBQQ0EKdHJhbnNfbWF0IDwtIGFzc2F5KHZzZCkgJT4lIGFzLm1hdHJpeCgpICU+JSB0KCkKCnBjYV9yZXMgPC0gc3RhdHM6OnByY29tcCh0cmFuc19tYXQpIAp2YXJfZXhwbGFpbmVkIDwtIHBjYV9yZXMkc2Rldl4yIC8gc3VtKHBjYV9yZXMkc2Rldl4yKQoKcGNhX3RibCA8LSBwY2FfcmVzJHggJT4lCiAgYXMuZGF0YS5mcmFtZSgpICU+JQogIHJvd25hbWVzX3RvX2NvbHVtbigicnVuX2lkIikgJT4lCiAgbGVmdF9qb2luKG1ldGFkYXRhLCBieSA9IGMoInJ1bl9pZCIpKQoKcGNhX3RibCAlPiUKICBnZ3Bsb3QoYWVzKAogICAgeCA9IFBDMSwgeSA9IFBDMiwgY29sb3IgPSBzdGF0ZSwgc2hhcGUgPSBmYWN0b3JzCiAgKSkgKyAKICBnZW9tX3BvaW50KHNpemUgPSAzKSArCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9IHN0YXRlKSwgaGp1c3QgPSAtMC4yLCB2anVzdCA9IC0wLjIpICsKICB4bGFiKHBhc3RlKCJQQzE6ICIsIHJvdW5kKHZhcl9leHBsYWluZWRbMV0gKiAxMDAsIGRpZ2l0cyA9IDIpLCAiJSIsIHNlcCA9ICIiKSkgKwogIHlsYWIocGFzdGUoIlBDMjogIiwgcm91bmQodmFyX2V4cGxhaW5lZFsyXSAqIDEwMCwgZGlnaXRzID0gMiksICIlIiwgc2VwID0gIiIpKQpgYGAKCmByIHBhc3RlKCJQQzE6ICIsIHJvdW5kKHZhcl9leHBsYWluZWRbMV0gKiAxMDAsIGRpZ2l0cyA9IDIpLCAiJSIsIHNlcCA9ICIiKWAgIApgciBwYXN0ZSgiUEMyOiAiLCByb3VuZCh2YXJfZXhwbGFpbmVkWzJdICogMTAwLCBkaWdpdHMgPSAyKSwgIiUiLCBzZXAgPSAiIilgICAKYHIgcGFzdGUoIlBDMzogIiwgcm91bmQodmFyX2V4cGxhaW5lZFszXSAqIDEwMCwgZGlnaXRzID0gMiksICIlIiwgc2VwID0gIiIpYAoKYGBge3IgZmlnLmhlaWdodD03LCBmaWcud2lkdGg9OH0KcGxvdGx5OjpwbG90X2x5KHBjYV90YmwsCiAgeCA9IH5QQzEsIHkgPSB+UEMyLCB6ID0gflBDMywKICBjb2xvciA9IH5wY2FfdGJsJHN0YXRlLAogIHN5bWJvbCA9IH5wY2FfdGJsJGZhY3RvcnMsCiAgbWFya2VyID0gbGlzdChzaXplID0gMTApCikgJT4lCiAgcGxvdGx5OjphZGRfbWFya2VycygpCmBgYAoKCgoKCgoKCgoKCgoKCg==